library(ggplot2)
library(dplyr)
library(scales)
library(tidyr)
library(rstudioapi)
YLT: I changed this part, now u can just run and the path is set automatically
main_dir<- dirname(dirname(rstudioapi::getSourceEditorContext()$path))
datadir<- paste0(main_dir,"/data")
data_path<- paste0(datadir,"/data.csv")
df=read.csv(data_path)
head(df)
## stn_code sampling_date state location agency
## 1 150 February - M021990 Andhra Pradesh Hyderabad <NA>
## 2 151 February - M021990 Andhra Pradesh Hyderabad <NA>
## 3 152 February - M021990 Andhra Pradesh Hyderabad <NA>
## 4 150 March - M031990 Andhra Pradesh Hyderabad <NA>
## 5 151 March - M031990 Andhra Pradesh Hyderabad <NA>
## 6 152 March - M031990 Andhra Pradesh Hyderabad <NA>
## type so2 no2 rspm spm
## 1 Residential, Rural and other Areas 4.8 17.4 NA NA
## 2 Industrial Area 3.1 7.0 NA NA
## 3 Residential, Rural and other Areas 6.2 28.5 NA NA
## 4 Residential, Rural and other Areas 6.3 14.7 NA NA
## 5 Industrial Area 4.7 7.5 NA NA
## 6 Residential, Rural and other Areas 6.4 25.7 NA NA
## location_monitoring_station pm2_5 date
## 1 <NA> NA 1990-02-01
## 2 <NA> NA 1990-02-01
## 3 <NA> NA 1990-02-01
## 4 <NA> NA 1990-03-01
## 5 <NA> NA 1990-03-01
## 6 <NA> NA 1990-03-01
str(df)
## 'data.frame': 435742 obs. of 13 variables:
## $ stn_code : chr "150" "151" "152" "150" ...
## $ sampling_date : chr "February - M021990" "February - M021990" "February - M021990" "March - M031990" ...
## $ state : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ location : chr "Hyderabad" "Hyderabad" "Hyderabad" "Hyderabad" ...
## $ agency : chr NA NA NA NA ...
## $ type : chr "Residential, Rural and other Areas" "Industrial Area" "Residential, Rural and other Areas" "Residential, Rural and other Areas" ...
## $ so2 : num 4.8 3.1 6.2 6.3 4.7 6.4 5.4 4.7 4.2 4 ...
## $ no2 : num 17.4 7 28.5 14.7 7.5 25.7 17.1 8.7 23 8.9 ...
## $ rspm : num NA NA NA NA NA NA NA NA NA NA ...
## $ spm : num NA NA NA NA NA NA NA NA NA NA ...
## $ location_monitoring_station: chr NA NA NA NA ...
## $ pm2_5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "1990-02-01" "1990-02-01" "1990-02-01" "1990-03-01" ...
There are 13 variables and 435742 row of observations in total.
summary(df)
## stn_code sampling_date state location
## Length:435742 Length:435742 Length:435742 Length:435742
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## agency type so2 no2
## Length:435742 Length:435742 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 5.00 1st Qu.: 14.00
## Mode :character Mode :character Median : 8.00 Median : 22.00
## Mean : 10.83 Mean : 25.81
## 3rd Qu.: 13.70 3rd Qu.: 32.20
## Max. :909.00 Max. :876.00
## NA's :34646 NA's :16233
## rspm spm location_monitoring_station pm2_5
## Min. : 0.0 Min. : 0.0 Length:435742 Min. : 3.0
## 1st Qu.: 56.0 1st Qu.: 111.0 Class :character 1st Qu.: 24.0
## Median : 90.0 Median : 187.0 Mode :character Median : 32.0
## Mean : 108.8 Mean : 220.8 Mean : 40.8
## 3rd Qu.: 142.0 3rd Qu.: 296.0 3rd Qu.: 46.0
## Max. :6307.0 Max. :3380.0 Max. :504.0
## NA's :40222 NA's :237387 NA's :426428
## date
## Length:435742
## Class :character
## Mode :character
##
##
##
##
Stn_code, sampling_date, state, location, agency, type, location_monitoring_station and date is labelled as character variables.
so2,no2,rspm,spm and pm2_5 are numerical variables with missing values and maybe outliers because of abnormal large maximum value.
for(i in 1:13){
x=sum(is.na(df[i]))
print(paste(names(df[i]),x))
}
## [1] "stn_code 144077"
## [1] "sampling_date 3"
## [1] "state 0"
## [1] "location 3"
## [1] "agency 149481"
## [1] "type 5393"
## [1] "so2 34646"
## [1] "no2 16233"
## [1] "rspm 40222"
## [1] "spm 237387"
## [1] "location_monitoring_station 27491"
## [1] "pm2_5 426428"
## [1] "date 7"
There are missing value except state variable.
Since “sampling_date” carry same meaning as “date”, we may drop it.
stn_code, agency and location_monitoring_station also seen carry same meaning with location and state, we may drop it too.
df$sampling_date=NULL
df$stn_code=NULL
df$agency=NULL
df$location_monitoring_station=NULL
head(df)
## state location type so2 no2 rspm spm
## 1 Andhra Pradesh Hyderabad Residential, Rural and other Areas 4.8 17.4 NA NA
## 2 Andhra Pradesh Hyderabad Industrial Area 3.1 7.0 NA NA
## 3 Andhra Pradesh Hyderabad Residential, Rural and other Areas 6.2 28.5 NA NA
## 4 Andhra Pradesh Hyderabad Residential, Rural and other Areas 6.3 14.7 NA NA
## 5 Andhra Pradesh Hyderabad Industrial Area 4.7 7.5 NA NA
## 6 Andhra Pradesh Hyderabad Residential, Rural and other Areas 6.4 25.7 NA NA
## pm2_5 date
## 1 NA 1990-02-01
## 2 NA 1990-02-01
## 3 NA 1990-02-01
## 4 NA 1990-03-01
## 5 NA 1990-03-01
## 6 NA 1990-03-01
Let’s observe the other variables one by one.
plotdata <- df %>%
count(state) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
plotdata
## state n pct pctlabel
## 1 andaman-and-nicobar-islands 1 2.294936e-06 0%
## 2 Andhra Pradesh 26368 6.051287e-02 6%
## 3 Arunachal Pradesh 90 2.065442e-04 0%
## 4 Assam 19361 4.443226e-02 4%
## 5 Bihar 2275 5.220979e-03 1%
## 6 Chandigarh 8520 1.955285e-02 2%
## 7 Chhattisgarh 7831 1.797164e-02 2%
## 8 Dadra & Nagar Haveli 634 1.454989e-03 0%
## 9 Daman & Diu 782 1.794640e-03 0%
## 10 Delhi 8551 1.962400e-02 2%
## 11 Goa 6206 1.424237e-02 1%
## 12 Gujarat 21279 4.883394e-02 5%
## 13 Haryana 3420 7.848681e-03 1%
## 14 Himachal Pradesh 22896 5.254485e-02 5%
## 15 Jammu & Kashmir 1289 2.958172e-03 0%
## 16 Jharkhand 5968 1.369618e-02 1%
## 17 Karnataka 17119 3.928701e-02 4%
## 18 Kerala 24728 5.674918e-02 6%
## 19 Lakshadweep 1 2.294936e-06 0%
## 20 Madhya Pradesh 19920 4.571513e-02 5%
## 21 Maharashtra 60384 1.385774e-01 14%
## 22 Manipur 76 1.744151e-04 0%
## 23 Meghalaya 3853 8.842388e-03 1%
## 24 Mizoram 5338 1.225037e-02 1%
## 25 Nagaland 2463 5.652427e-03 1%
## 26 Odisha 19279 4.424407e-02 4%
## 27 Puducherry 3785 8.686333e-03 1%
## 28 Punjab 25634 5.882839e-02 6%
## 29 Rajasthan 25589 5.872512e-02 6%
## 30 Sikkim 1 2.294936e-06 0%
## 31 Tamil Nadu 20597 4.726880e-02 5%
## 32 Telangana 3978 9.129255e-03 1%
## 33 Tripura 1 2.294936e-06 0%
## 34 Uttar Pradesh 42816 9.825998e-02 10%
## 35 Uttarakhand 1961 4.500369e-03 0%
## 36 Uttaranchal 285 6.540568e-04 0%
## 37 West Bengal 22463 5.155115e-02 5%
p1<-ggplot(plotdata, aes(x=reorder(state, n),y=pct)) +
geom_bar(stat = "identity", fill = "lightgreen") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(labels = percent) +
labs(y = "Percentage",x="State", title = "Percentage of Recording From Each State")
p1
Maharashtra have most observations followed by Uttar Pradesh and others.
plotdata <- df %>%
count(location) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
plotdata
## location n pct pctlabel
## 1 Agra 7306 1.676680e-02 2%
## 2 Ahmedabad 6256 1.435712e-02 1%
## 3 Aizawl 3499 8.029981e-03 1%
## 4 Akola 1048 2.405093e-03 0%
## 5 Alappuzha 1512 3.469943e-03 0%
## 6 Allahabad 1718 3.942700e-03 0%
## 7 Alwar 3746 8.596830e-03 1%
## 8 Amlai 119 2.730974e-04 0%
## 9 Amona 299 6.861859e-04 0%
## 10 Amravati 2456 5.636363e-03 1%
## 11 Amritsar 1334 3.061445e-03 0%
## 12 Ananthapur 324 7.435593e-04 0%
## 13 Angul 2357 5.409164e-03 1%
## 14 Ankleshwar 1240 2.845721e-03 0%
## 15 ANKLESHWAR 167 3.832543e-04 0%
## 16 Anklesvar 966 2.216908e-03 0%
## 17 Anpara 1941 4.454471e-03 0%
## 18 Asansol 1609 3.692552e-03 0%
## 19 Assanora 298 6.838909e-04 0%
## 20 Aurangabad 1839 4.220387e-03 0%
## 21 Aurangabad (MS) 1176 2.698845e-03 0%
## 22 Baddi 2965 6.804485e-03 1%
## 23 Badlapur 585 1.342538e-03 0%
## 24 Balasore 1239 2.843426e-03 0%
## 25 Bangalore 6660 1.528427e-02 2%
## 26 Bareilly 760 1.744151e-03 0%
## 27 Baroda 705 1.617930e-03 0%
## 28 Barrackpore 849 1.948401e-03 0%
## 29 Baruipur 45 1.032721e-04 0%
## 30 Bathinda 930 2.134290e-03 0%
## 31 Belgaum 711 1.631699e-03 0%
## 32 Berhampur 808 1.854308e-03 0%
## 33 Bharuch 353 8.101124e-04 0%
## 34 Bhilai 1894 4.346609e-03 0%
## 35 Bhilai Nagar 1261 2.893914e-03 0%
## 36 Bhopal 3449 7.915234e-03 1%
## 37 Bhubaneshwar 1300 2.983417e-03 0%
## 38 Bhubaneswar 2386 5.475717e-03 1%
## 39 Bhuj 24 5.507846e-05 0%
## 40 Bicholim 301 6.907757e-04 0%
## 41 Bidar 88 2.019544e-04 0%
## 42 Bijapur 68 1.560556e-04 0%
## 43 Bilaspur 240 5.507846e-04 0%
## 44 Bombay 423 9.707579e-04 0%
## 45 Bongaigaon 1681 3.857787e-03 0%
## 46 Byrnihat 510 1.170417e-03 0%
## 47 Calcutta 564 1.294344e-03 0%
## 48 Champhai 626 1.436630e-03 0%
## 49 Chandarpur 77 1.767101e-04 0%
## 50 Chandigarh 8520 1.955285e-02 2%
## 51 Chandrapur 4782 1.097438e-02 1%
## 52 Chennai 6646 1.525214e-02 2%
## 53 Chhindwara 125 2.868670e-04 0%
## 54 Chitradurga 296 6.793011e-04 0%
## 55 Chittoor 1003 2.301821e-03 0%
## 56 Chittorgarh 36 8.261770e-05 0%
## 57 Cochin 1015 2.329360e-03 0%
## 58 Codli 301 6.907757e-04 0%
## 59 Coimbatore 3268 7.499851e-03 1%
## 60 Cuddalore 651 1.494003e-03 0%
## 61 Cuncolim 103 2.363784e-04 0%
## 62 Curchorem 307 7.045454e-04 0%
## 63 Cuttack 1874 4.300710e-03 0%
## 64 Daman 647 1.484824e-03 0%
## 65 Daman Diu & Nagar 166 3.809594e-04 0%
## 66 Damtal 2945 6.758587e-03 1%
## 67 DANKUNI 103 2.363784e-04 0%
## 68 Daranga 399 9.156795e-04 0%
## 69 Davangere 701 1.608750e-03 0%
## 70 Dawki 510 1.170417e-03 0%
## 71 Dehradoon 262 6.012732e-04 0%
## 72 Dehradun 1029 2.361489e-03 0%
## 73 Delhi 8551 1.962400e-02 2%
## 74 Dera Baba 608 1.395321e-03 0%
## 75 Dera Bassi 2464 5.654722e-03 1%
## 76 Dewas 1777 4.078101e-03 0%
## 77 Dhanbad 1758 4.034497e-03 0%
## 78 Dharamshala 247 5.668492e-04 0%
## 79 Dharuhera 1 2.294936e-06 0%
## 80 Dharwad 409 9.386288e-04 0%
## 81 Dibrugarh 790 1.812999e-03 0%
## 82 Dimapur 1761 4.041382e-03 0%
## 83 Dombivli 827 1.897912e-03 0%
## 84 Domlur 126 2.891619e-04 0%
## 85 Durgapur 955 2.191664e-03 0%
## 86 Durgapur (WB) 1375 3.155537e-03 0%
## 87 Eluru 300 6.884808e-04 0%
## 88 Faridabad 1858 4.263991e-03 0%
## 89 Faridkot 97 2.226088e-04 0%
## 90 Firozabad 2620 6.012732e-03 1%
## 91 Gajraula 1508 3.460763e-03 0%
## 92 Gajroula 217 4.980011e-04 0%
## 93 Gangtok 1 2.294936e-06 0%
## 94 Ghaziabad 2056 4.718388e-03 0%
## 95 Gobindgarh 3976 9.124666e-03 1%
## 96 Golaghat 707 1.622520e-03 0%
## 97 Gorakhpur 670 1.537607e-03 0%
## 98 Greater Mumbai 689 1.581211e-03 0%
## 99 Gulbarga 767 1.760216e-03 0%
## 100 Guntur 629 1.443515e-03 0%
## 101 Guwahati 9984 2.291264e-02 2%
## 102 Gwalior 1296 2.974237e-03 0%
## 103 Hailakandi 231 5.301302e-04 0%
## 104 Haldia 2381 5.464243e-03 1%
## 105 HALDIA 104 2.386733e-04 0%
## 106 Haldwani 254 5.829137e-04 0%
## 107 Haridwar 299 6.861859e-04 0%
## 108 Hassan 1024 2.350014e-03 0%
## 109 Hisar 1012 2.322475e-03 0%
## 110 Honda 300 6.884808e-04 0%
## 111 Hoshiarpur 88 2.019544e-04 0%
## 112 Howrah 3601 8.264065e-03 1%
## 113 Hubli 411 9.432187e-04 0%
## 114 Hubli-Dharwad 872 2.001184e-03 0%
## 115 Hyderabad 9667 2.218515e-02 2%
## 116 Imphal 76 1.744151e-04 0%
## 117 Indore 3456 7.931299e-03 1%
## 118 Itanagar 45 1.032721e-04 0%
## 119 Jabalpur 984 2.258217e-03 0%
## 120 Jaipur 7850 1.801525e-02 2%
## 121 Jalandhar 3784 8.684038e-03 1%
## 122 Jalgaon 1853 4.252516e-03 0%
## 123 Jalna 905 2.076917e-03 0%
## 124 Jammu 1289 2.958172e-03 0%
## 125 Jamnagar 1015 2.329360e-03 0%
## 126 Jamshedpur 1979 4.541678e-03 0%
## 127 Jhansi 1769 4.059742e-03 0%
## 128 Jharia 1000 2.294936e-03 0%
## 129 Jodhpur 6035 1.384994e-02 1%
## 130 Kadapa 316 7.251998e-04 0%
## 131 Kakinada 288 6.609416e-04 0%
## 132 Kala Amb 1548 3.552561e-03 0%
## 133 Kalinga Nagar 514 1.179597e-03 0%
## 134 Kalyani 104 2.386733e-04 0%
## 135 Kanpur 7545 1.731529e-02 2%
## 136 Karaikal 431 9.891174e-04 0%
## 137 Karimnagar 167 3.832543e-04 0%
## 138 Kashipur 236 5.416049e-04 0%
## 139 Keonjhar 76 1.744151e-04 0%
## 140 Khadoli 309 7.091352e-04 0%
## 141 Khajuraho 16 3.671898e-05 0%
## 142 Khammam 698 1.601865e-03 0%
## 143 Khanna 2135 4.899688e-03 0%
## 144 Khliehriat 221 5.071809e-04 0%
## 145 Khurja 1302 2.988007e-03 0%
## 146 Kochi 7263 1.666812e-02 2%
## 147 Kohima 702 1.611045e-03 0%
## 148 Kolar 199 4.566923e-04 0%
## 149 Kolasib 623 1.429745e-03 0%
## 150 Kolhapur 3180 7.297896e-03 1%
## 151 Kolkata 7733 1.774674e-02 2%
## 152 Kollam 1467 3.366671e-03 0%
## 153 Konark 78 1.790050e-04 0%
## 154 Korba 3321 7.621482e-03 1%
## 155 Kota 4181 9.595127e-03 1%
## 156 Kothur 96 2.203139e-04 0%
## 157 Kottayam 2395 5.496372e-03 1%
## 158 Kotttayam 144 3.304708e-04 0%
## 159 Kozhikode 2624 6.021912e-03 1%
## 160 Kundaim 207 4.750518e-04 0%
## 161 Kurnool 857 1.966760e-03 0%
## 162 Lakhimpur 503 1.154353e-03 0%
## 163 Latur 1776 4.075806e-03 0%
## 164 Lote 957 2.196254e-03 0%
## 165 Lucknow 6194 1.421483e-02 1%
## 166 Ludhiana 6175 1.417123e-02 1%
## 167 Lunglei 590 1.354012e-03 0%
## 168 Madras 937 2.150355e-03 0%
## 169 Madurai 3038 6.972016e-03 1%
## 170 Mahad 543 1.246150e-03 0%
## 171 Malappuram 672 1.542197e-03 0%
## 172 MALDAH 104 2.386733e-04 0%
## 173 Manali 779 1.787755e-03 0%
## 174 Mandya 304 6.976605e-04 0%
## 175 Mangalore 906 2.079212e-03 0%
## 176 Mapusa 205 4.704619e-04 0%
## 177 Margao 207 4.750518e-04 0%
## 178 Margherita 426 9.776427e-04 0%
## 179 Mathura 78 1.790050e-04 0%
## 180 Meerut 684 1.569736e-03 0%
## 181 Mettur 486 1.115339e-03 0%
## 182 Moradabad 956 2.193959e-03 0%
## 183 MORBI 46 1.055671e-04 0%
## 184 Mormugao 263 6.035682e-04 0%
## 185 Mumbai 2461 5.647837e-03 1%
## 186 Mysore 2632 6.040272e-03 1%
## 187 Nagaon 451 1.035016e-03 0%
## 188 Nagda 3038 6.972016e-03 1%
## 189 Nagpur 7829 1.796705e-02 2%
## 190 Nahan 1164 2.671305e-03 0%
## 191 Naharlagun 45 1.032721e-04 0%
## 192 Nalagarh 852 1.955285e-03 0%
## 193 Nalbari 489 1.122224e-03 0%
## 194 Nalco 108 2.478531e-04 0%
## 195 Nalgonda 941 2.159535e-03 0%
## 196 Nanded 1432 3.286348e-03 0%
## 197 Nashik 5145 1.180745e-02 1%
## 198 Navi Mumbai 5541 1.271624e-02 1%
## 199 Naya Nangal 2479 5.689146e-03 1%
## 200 Nellore 408 9.363339e-04 0%
## 201 Nizamabad 27 6.196327e-05 0%
## 202 Noida 1506 3.456174e-03 0%
## 203 Noida, Ghaziabad 27 6.196327e-05 0%
## 204 Nongstoin 236 5.416049e-04 0%
## 205 Ongole 317 7.274947e-04 0%
## 206 Palakkad 1161 2.664421e-03 0%
## 207 Panaji 933 2.141175e-03 0%
## 208 Panjim 157 3.603050e-04 0%
## 209 Paonta Sahib 3301 7.575584e-03 1%
## 210 Paradeep 385 8.835504e-04 0%
## 211 Parwanoo 3523 8.085060e-03 1%
## 212 Patancheru 913 2.095277e-03 0%
## 213 Pathanamthitta 597 1.370077e-03 0%
## 214 Patiala 1435 3.293233e-03 0%
## 215 Patna 1563 3.586985e-03 0%
## 216 Pithampur 139 3.189961e-04 0%
## 217 Ponda 219 5.025910e-04 0%
## 218 Pondicherry 2913 6.685149e-03 1%
## 219 Pondichery 441 1.012067e-03 0%
## 220 Pune 5179 1.188547e-02 1%
## 221 Puri 58 1.331063e-04 0%
## 222 Rai Bareilly 970 2.226088e-03 0%
## 223 Raichur 132 3.029316e-04 0%
## 224 Raipur 1708 3.919751e-03 0%
## 225 Rajahmundry 311 7.137251e-04 0%
## 226 Rajkot 1858 4.263991e-03 0%
## 227 Ramagundam 724 1.661534e-03 0%
## 228 Ranchi 755 1.732677e-03 0%
## 229 Ranebennur 302 6.930707e-04 0%
## 230 Raniganj 883 2.026428e-03 0%
## 231 Rayagada 2230 5.117707e-03 1%
## 232 Renusagar 666 1.528427e-03 0%
## 233 Rishikesh 215 4.934112e-04 0%
## 234 Roha 254 5.829137e-04 0%
## 235 Rourkela 2964 6.802190e-03 1%
## 236 Rudrapur 213 4.888214e-04 0%
## 237 Sagar 732 1.679893e-03 0%
## 238 Saharanpur 46 1.055671e-04 0%
## 239 Salem 1378 3.162422e-03 0%
## 240 Sambalpur 941 2.159535e-03 0%
## 241 SANAND 23 5.278353e-05 0%
## 242 Sangareddy 649 1.489413e-03 0%
## 243 Sangli 1637 3.756810e-03 0%
## 244 Sangrur 129 2.960467e-04 0%
## 245 Sanguem 204 4.681669e-04 0%
## 246 Sankrail 805 1.847423e-03 0%
## 247 Saraikela Kharsawan 117 2.685075e-04 0%
## 248 Sarigam 24 5.507846e-05 0%
## 249 Satna 1414 3.245039e-03 0%
## 250 Shillong 1876 4.305300e-03 0%
## 251 Shimla 3303 7.580174e-03 1%
## 252 Shimoga 309 7.091352e-04 0%
## 253 Sibsagar 240 5.507846e-04 0%
## 254 Silchar 75 1.721202e-04 0%
## 255 Silcher 853 1.957580e-03 0%
## 256 SILIGURI 104 2.386733e-04 0%
## 257 Silvassa 294 6.747112e-04 0%
## 258 Sindri 920 2.111341e-03 0%
## 259 Singrauli 453 1.039606e-03 0%
## 260 Sivasagar 853 1.957580e-03 0%
## 261 Solapur 2628 6.031092e-03 1%
## 262 South Suburban 1040 2.386733e-03 0%
## 263 Srikakulam 315 7.229048e-04 0%
## 264 Sunder Nagar 1115 2.558854e-03 0%
## 265 Surat 3441 7.896875e-03 1%
## 266 Talcher 1961 4.500369e-03 0%
## 267 Tarapur 592 1.358602e-03 0%
## 268 Tezpur 839 1.925451e-03 0%
## 269 Thane 3620 8.307668e-03 1%
## 270 Thiruvananthapuram 2546 5.842907e-03 1%
## 271 Thissur 461 1.057965e-03 0%
## 272 Thoothukudi 2722 6.246816e-03 1%
## 273 Tilamol 204 4.681669e-04 0%
## 274 Tinsukia 840 1.927746e-03 0%
## 275 Tirupati 986 2.262807e-03 0%
## 276 Trichy 1183 2.714909e-03 0%
## 277 Trivandrum 1935 4.440701e-03 0%
## 278 Trivendrum 424 9.730529e-04 0%
## 279 Tumkur 202 4.635771e-04 0%
## 280 Tura 432 9.914123e-04 0%
## 281 Turicorin 41 9.409238e-05 0%
## 282 Tuticorin 247 5.668492e-04 0%
## 283 Udaipur 3741 8.585356e-03 1%
## 284 Ujjain 2329 5.344906e-03 1%
## 285 Ulhasnagar 950 2.180189e-03 0%
## 286 ULUBERIA 104 2.386733e-04 0%
## 287 Umsning 68 1.560556e-04 0%
## 288 Una 1154 2.648356e-03 0%
## 289 Unnao 388 8.904352e-04 0%
## 290 Usgao 305 6.999555e-04 0%
## 291 Vadodara 2968 6.811370e-03 1%
## 292 Vapi 2169 4.977716e-03 0%
## 293 VAPI 24 5.507846e-05 0%
## 294 Varanasi 1627 3.733861e-03 0%
## 295 Vasco 1393 3.196846e-03 0%
## 296 Vijayawada 2093 4.803301e-03 0%
## 297 Visakhapatnam 7108 1.631241e-02 2%
## 298 Vishakhapatnam 297 6.815960e-04 0%
## 299 Vizianagaram 282 6.471720e-04 0%
## 300 Warangal 630 1.445810e-03 0%
## 301 Wayanad 512 1.175007e-03 0%
## 302 West Singhbhum 151 3.465353e-04 0%
## 303 Yamuna Nagar 329 7.550339e-04 0%
## 304 Yamunanagar 220 5.048859e-04 0%
## 305 <NA> 3 6.884808e-06 0%
p1<-ggplot(plotdata, aes(x = reorder(location, n),y=pct)) +
geom_bar(stat = "identity", fill = "lightgreen") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
scale_y_continuous(labels = percent) +
labs(y = "Percentage", x = "Location", title = "Percentage of Recording From Each Location")
p1
There are 305 distinct value for Location variable. Guwahati is the largest class. There are 3 missing value, we may drop it later.
plotdata <- df %>%
count(type) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
plotdata
## type n pct pctlabel
## 1 Industrial 233 0.0005347201 0%
## 2 Industrial Area 96091 0.2205226946 22%
## 3 Industrial Areas 51747 0.1187560529 12%
## 4 Residential 158 0.0003625999 0%
## 5 Residential and others 86791 0.1991797899 20%
## 6 Residential, Rural and other Areas 179014 0.4108256721 41%
## 7 RIRUO 1304 0.0029925965 0%
## 8 Sensitive 495 0.0011359933 0%
## 9 Sensitive Area 8980 0.0206085252 2%
## 10 Sensitive Areas 5536 0.0127047657 1%
## 11 <NA> 5393 0.0123765898 1%
p1<-ggplot(plotdata, aes(x = reorder(type, n), y=pct)) +
geom_bar(stat = "identity", fill = "lightgreen") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(labels = percent) +
labs(y = "Percentage", x="Area Type", title = "Percentage of Recording From Each Type of Area")
p1
We can group the data into “residual”, “industrial” and “others” class.
df$type[df$type %in% "Residential, Rural and other Areas"] <- "Residential"
df$type[df$type %in% "Residential and others"] <- "Residential"
df$type[df$type %in% "Industrial Area"] <- "Industrial"
df$type[df$type %in% "Industrial Areas"] <- "Industrial"
df$type[df$type %in% "RIRUO"] <- "Others"
df$type[df$type %in% "Sensitive"] <- "Others"
df$type[df$type %in% "Sensitive Areas"] <- "Others"
df$type[df$type %in% "Sensitive Area"] <- "Others"
df$type[is.na(df$type)]= "Others"
plotdata <- df %>%
count(type) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
plotdata
## type n pct pctlabel
## 1 Industrial 148071 0.33981347 34%
## 2 Others 21708 0.04981847 5%
## 3 Residential 265963 0.61036806 61%
p1<-ggplot(plotdata, aes(x = reorder(type, n), y=pct)) +
geom_bar(stat = "identity", fill = "lightgreen") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(labels = percent) +
labs(y = "Percentage", x="Area Type", title = "Percentage of Recording From Each Type of Area")
p1
Now there is only 3 groups in area type.
p1<-ggplot(df, aes(x = so2)) +
geom_histogram(binwidth=0.1, colour="black", fill = "lightgreen")+
labs(title = "HIstogram of so2")
p1
## Warning: Removed 34646 rows containing non-finite values (stat_bin).
There are 34646 rows of missing value in “so2”.
We can try focus on range(0~100) for drawing graph.
p1<-ggplot(df, aes(x = so2)) +
geom_histogram(binwidth=0.1, breaks= seq(0,100), colour="black", fill = "lightgreen")+
labs(title = "HIstogram of so2")
p1
## Warning: Removed 34646 rows containing non-finite values (stat_bin).
boxplot(df$so2,
main = "Boxplot for so2",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
We can observe that the data is right skewed, data scattered around low value of so2 value.
There are outliers in the dataset, we will be cleaning it later.
p1<-ggplot(df, aes(x = no2)) +
geom_histogram(binwidth=1, colour="black", fill = "lightgreen")+
labs(title = "HIstogram of no2")
p1
## Warning: Removed 16233 rows containing non-finite values (stat_bin).
There are 16233 rows of missing value in “no2”.
We can try focus on range(0~200) for drawing graph.
p1<-ggplot(df, aes(x = no2)) +
geom_histogram(binwidth=1, breaks= seq(0,200),colour="black", fill = "lightgreen")+
labs(title = "HIstogram of no2")
p1
## Warning: Removed 16233 rows containing non-finite values (stat_bin).
boxplot(df$no2,
main = "Boxplot for no2",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
We can observe that the data is right skewed, data scattered around low value of no2 value.
There are outliers in the dataset, we will be cleaning it later.
p1<-ggplot(df, aes(x = rspm)) +
geom_histogram(binwidth=10, colour="black", fill = "lightgreen")+
labs(title = "HIstogram of rspm")
p1
## Warning: Removed 40222 rows containing non-finite values (stat_bin).
There are 40222 rows of missing value in “rspm”.
We can try focus on range(0~1000) for drawing graph.
p1<-ggplot(df, aes(x = rspm)) +
geom_histogram(binwidth=100, breaks= seq(0,1000), fill = "lightgreen")+
labs(title = "HIstogram of rspm")
p1
## Warning: Removed 40222 rows containing non-finite values (stat_bin).
boxplot(df$rspm,
main = "Boxplot for rspm",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
We can observe that the data is right skewed, data scattered around low value of rspm value.
There are outliers in the dataset, we will be cleaning it later.
p1<-ggplot(df, aes(x = spm)) +
geom_histogram(binwidth=1, colour="black", fill = "lightgreen")+
labs(title = "HIstogram of spm")
p1
## Warning: Removed 237387 rows containing non-finite values (stat_bin).
There are 237387 rows of missing value in “spm”.
We can try focus on range(0~1000) for drawing graph.
p1<-ggplot(df, aes(x = spm)) +
geom_histogram(binwidth=100, breaks= seq(0,1000) ,fill = "lightgreen")+
labs(title = "HIstogram of spm")
p1
## Warning: Removed 237387 rows containing non-finite values (stat_bin).
boxplot(df$spm,
main = "Boxplot for spm",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
We can observe that the data is right skewed, data scattered around low value of spm value.
There are outliers in the dataset, we will be cleaning it later.
##pm2_5
p1<-ggplot(df, aes(x = pm2_5)) +
geom_histogram(binwidth=0.5, colour="black", fill = "lightgreen")+
labs(title = "HIstogram of pm2.5")
p1
## Warning: Removed 426428 rows containing non-finite values (stat_bin).
There are 426428 rows of missing value in “pm2_5”.
We can try focus on range(0~300) for drawing graph.
p1<-ggplot(df, aes(x = pm2_5)) +
geom_histogram(binwidth=0.5, breaks= seq(0,300), colour="black", fill = "lightgreen")+
labs(title = "HIstogram of pm2.5")
p1
## Warning: Removed 426428 rows containing non-finite values (stat_bin).
boxplot(df$pm2_5,
main = "Boxplot for pm2_5",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
We can observe that the data is right skewed, data scattered around low value of pm2_5 value.
There are outliers in the dataset, we will be cleaning it later.
df <- df %>% mutate(so2 = if_else(so2>500, mean(df$so2), df$so2))
df <- df %>% mutate(no2 = if_else(no2>500, mean(df$no2), df$no2))
df <- df %>% mutate(rspm = if_else(rspm>2000, mean(df$rspm), df$rspm))
df <- df %>% mutate(spm = if_else(spm>2000, mean(df$spm), df$spm))
df <- df %>% mutate(pm2_5 = if_else(pm2_5>350, mean(df$pm2_5), df$pm2_5))
par(mfrow=c(1,5))
boxplot(df$so2,
main = "Boxplot for so2",
col = "orange",
border = "brown",
notch = TRUE)
boxplot(df$no2,
main = "Boxplot for no2",
col = "orange",
border = "brown",
notch = TRUE)
boxplot(df$rspm,
main = "Boxplot for rspm",
col = "orange",
border = "brown",
notch = TRUE)
boxplot(df$spm,
main = "Boxplot for spm",
col = "orange",
border = "brown",
notch = TRUE)
boxplot(df$pm2_5,
main = "Boxplot for pm2_5",
col = "orange",
border = "brown",
notch = TRUE)
The outliers has been removed.
Because we want to build two models,
time series regression for predicting pm2.5 for year 2014 and 2015.
clustering using all data. so, we first make a subset for first model and clean it.
df_ts=subset(df, !is.na(pm2_5))
head(df_ts)
## state location type so2 no2 rspm spm pm2_5
## 65037 Dadra & Nagar Haveli Khadoli Industrial 18 31 104 NA 35
## 65038 Dadra & Nagar Haveli Khadoli Industrial 14 26 94 NA 32
## 65039 Dadra & Nagar Haveli Khadoli Industrial 16 28 99 NA 35
## 65040 Dadra & Nagar Haveli Khadoli Industrial 13 23 82 NA 24
## 65041 Dadra & Nagar Haveli Khadoli Industrial 14 29 93 NA 32
## 65042 Dadra & Nagar Haveli Khadoli Industrial 13 25 77 NA 22
## date
## 65037 2015-08-06
## 65038 2015-08-10
## 65039 2015-08-13
## 65040 2015-08-20
## 65041 2015-08-24
## 65042 2015-08-26
str(df_ts)
## 'data.frame': 9312 obs. of 9 variables:
## $ state : chr "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" ...
## $ location: chr "Khadoli" "Khadoli" "Khadoli" "Khadoli" ...
## $ type : chr "Industrial" "Industrial" "Industrial" "Industrial" ...
## $ so2 : num 18 14 16 13 14 13 12 15 14 16 ...
## $ no2 : num 31 26 28 23 29 25 21 26 25 26 ...
## $ rspm : num 104 94 99 82 93 77 81 90 89 100 ...
## $ spm : num NA NA NA NA NA NA NA NA NA NA ...
## $ pm2_5 : num 35 32 35 24 32 22 27 29 27 34 ...
## $ date : chr "2015-08-06" "2015-08-10" "2015-08-13" "2015-08-20" ...
There is only 9314 rows that have pm2.5 value.
for(i in 1:9){
x=sum(is.na(df_ts[i]))
print(paste(names(df_ts[i]),x))}
## [1] "state 0"
## [1] "location 0"
## [1] "type 0"
## [1] "so2 119"
## [1] "no2 96"
## [1] "rspm 60"
## [1] "spm 9312"
## [1] "pm2_5 0"
## [1] "date 0"
so2,no2,rspm have missing values. spm is blank.
df_ts$so2[is.na(df_ts$so2)]= mean(df_ts$so2, na.rm = TRUE)
df_ts$no2[is.na(df_ts$no2)]= mean(df_ts$no2, na.rm = TRUE)
df_ts$rspm[is.na(df_ts$rspm)]= mean(df_ts$rspm, na.rm = TRUE)
df_ts$spm=NULL
for(i in 1:8){
x=sum(is.na(df_ts[i]))
print(paste(names(df_ts[i]),x))}
## [1] "state 0"
## [1] "location 0"
## [1] "type 0"
## [1] "so2 0"
## [1] "no2 0"
## [1] "rspm 0"
## [1] "pm2_5 0"
## [1] "date 0"
There is no missing value now and we can export this subset out for visualization and modelling of time series regression.
df_ts=df_ts[(order(df_ts$date,decreasing=FALSE)),]
head(df_ts)
## state location type so2 no2 rspm pm2_5 date
## 193521 Madhya Pradesh Bhopal Residential 2.0 18.0 198 81 2014-01-01
## 284916 Odisha Cuttack Residential 2.0 37.0 151 95 2014-01-01
## 365619 Telangana Sangareddy Industrial 1.8 66.3 99 54 2014-01-01
## 193352 Madhya Pradesh Bhopal Industrial 2.0 21.0 215 96 2014-01-02
## 284925 Odisha Cuttack Residential 2.0 34.0 142 82 2014-01-02
## 365650 Telangana Sangareddy Industrial 12.4 86.3 149 78 2014-01-02
str(df_ts)
## 'data.frame': 9312 obs. of 8 variables:
## $ state : chr "Madhya Pradesh" "Odisha" "Telangana" "Madhya Pradesh" ...
## $ location: chr "Bhopal" "Cuttack" "Sangareddy" "Bhopal" ...
## $ type : chr "Residential" "Residential" "Industrial" "Industrial" ...
## $ so2 : num 2 2 1.8 2 2 ...
## $ no2 : num 18 37 66.3 21 34 86.3 23 29 38.4 23 ...
## $ rspm : num 198 151 99 215 142 149 155 96 51 88 ...
## $ pm2_5 : num 81 95 54 96 82 78 52 60 24 24 ...
## $ date : chr "2014-01-01" "2014-01-01" "2014-01-01" "2014-01-02" ...
#write.csv(df_ts,"data_ts.csv", row.names=FALSE)
#check the data of noPmNA
table(df_ts$state)
##
## Dadra & Nagar Haveli Daman & Diu Delhi
## 43 44 371
## Goa Gujarat Madhya Pradesh
## 1390 2401 828
## Odisha Tamil Nadu Telangana
## 2787 454 354
## West Bengal
## 640
#change the type of "date" to date for the convenience of drawing
df_ts$date<-as.Date(df_ts$date)
#split the dataset
splitByState <- split(df_ts, df_ts$state)
str(splitByState)
## List of 10
## $ Dadra & Nagar Haveli:'data.frame': 43 obs. of 8 variables:
## ..$ state : chr [1:43] "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" "Dadra & Nagar Haveli" ...
## ..$ location: chr [1:43] "Khadoli" "Khadoli" "Khadoli" "Khadoli" ...
## ..$ type : chr [1:43] "Industrial" "Industrial" "Industrial" "Industrial" ...
## ..$ so2 : num [1:43] 18 14 16 13 14 13 12 15 14 16 ...
## ..$ no2 : num [1:43] 31 26 28 23 29 25 21 26 25 26 ...
## ..$ rspm : num [1:43] 104 94 99 82 93 77 81 90 89 100 ...
## ..$ pm2_5 : num [1:43] 35 32 35 24 32 22 27 29 27 34 ...
## ..$ date : Date[1:43], format: "2015-08-06" "2015-08-10" ...
## $ Daman & Diu :'data.frame': 44 obs. of 8 variables:
## ..$ state : chr [1:44] "Daman & Diu" "Daman & Diu" "Daman & Diu" "Daman & Diu" ...
## ..$ location: chr [1:44] "Daman" "Daman" "Daman" "Daman" ...
## ..$ type : chr [1:44] "Industrial" "Industrial" "Industrial" "Industrial" ...
## ..$ so2 : num [1:44] 12 14 14 12 13 14 13 12 13 14 ...
## ..$ no2 : num [1:44] 22 24 26 21 23 25 22 21 23 25 ...
## ..$ rspm : num [1:44] 94 90 73 81 76 89 93 87 87 89 ...
## ..$ pm2_5 : num [1:44] 30 28 23 22 28 27 30 26 29 28 ...
## ..$ date : Date[1:44], format: "2015-08-06" "2015-08-10" ...
## $ Delhi :'data.frame': 371 obs. of 8 variables:
## ..$ state : chr [1:371] "Delhi" "Delhi" "Delhi" "Delhi" ...
## ..$ location: chr [1:371] "Delhi" "Delhi" "Delhi" "Delhi" ...
## ..$ type : chr [1:371] "Industrial" "Residential" "Industrial" "Residential" ...
## ..$ so2 : num [1:371] 4 4 4 4 4 4 4 4 4 4 ...
## ..$ no2 : num [1:371] 67 35 55 51 61 47 38 65 57 61 ...
## ..$ rspm : num [1:371] 397 221 250 192 289 143 171 344 197 425 ...
## ..$ pm2_5 : num [1:371] 110 141 155 86 151 62 139 163 126 147 ...
## ..$ date : Date[1:371], format: "2015-01-01" "2015-01-05" ...
## $ Goa :'data.frame': 1390 obs. of 8 variables:
## ..$ state : chr [1:1390] "Goa" "Goa" "Goa" "Goa" ...
## ..$ location: chr [1:1390] "Curchorem" "Margao" "Mapusa" "Ponda" ...
## ..$ type : chr [1:1390] "Residential" "Residential" "Residential" "Residential" ...
## ..$ so2 : num [1:1390] 4 5 9 4 4 15 4 4 5 4 ...
## ..$ no2 : num [1:1390] 9 9 11 9 9 9 8 9 10 9 ...
## ..$ rspm : num [1:1390] 70 94 72 60 68 77 44 77 88 63 ...
## ..$ pm2_5 : num [1:1390] 27 35 36 24 25 29 13 28 34 22 ...
## ..$ date : Date[1:1390], format: "2015-01-02" "2015-01-02" ...
## $ Gujarat :'data.frame': 2401 obs. of 8 variables:
## ..$ state : chr [1:2401] "Gujarat" "Gujarat" "Gujarat" "Gujarat" ...
## ..$ location: chr [1:2401] "Jamnagar" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## ..$ type : chr [1:2401] "Residential" "Industrial" "Residential" "Residential" ...
## ..$ so2 : num [1:2401] 19.5 15 14 12 13 ...
## ..$ no2 : num [1:2401] 23 18.9 18 18 20 ...
## ..$ rspm : num [1:2401] 88 73.3 65 67 67 ...
## ..$ pm2_5 : num [1:2401] 24 26 27 24 29 20 21 22 34 24 ...
## ..$ date : Date[1:2401], format: "2014-01-04" "2014-01-05" ...
## $ Madhya Pradesh :'data.frame': 828 obs. of 8 variables:
## ..$ state : chr [1:828] "Madhya Pradesh" "Madhya Pradesh" "Madhya Pradesh" "Madhya Pradesh" ...
## ..$ location: chr [1:828] "Bhopal" "Bhopal" "Bhopal" "Bhopal" ...
## ..$ type : chr [1:828] "Residential" "Industrial" "Industrial" "Residential" ...
## ..$ so2 : num [1:828] 2 2 2 2 2 2 2 3 2 2 ...
## ..$ no2 : num [1:828] 18 21 23 36 23 12 12 23 29 23 ...
## ..$ rspm : num [1:828] 198 215 155 79 155 143 128 246 125 121 ...
## ..$ pm2_5 : num [1:828] 81 96 52 68 140 209 98 148 49 85 ...
## ..$ date : Date[1:828], format: "2014-01-01" "2014-01-02" ...
## $ Odisha :'data.frame': 2787 obs. of 8 variables:
## ..$ state : chr [1:2787] "Odisha" "Odisha" "Odisha" "Odisha" ...
## ..$ location: chr [1:2787] "Cuttack" "Cuttack" "Cuttack" "Cuttack" ...
## ..$ type : chr [1:2787] "Residential" "Residential" "Residential" "Residential" ...
## ..$ so2 : num [1:2787] 2 2 2 2 2 2 2 2 12 13 ...
## ..$ no2 : num [1:2787] 37 34 29 25 30 24 14 24 11 12 ...
## ..$ rspm : num [1:2787] 151 142 96 71 95 54 50 38 37 56 ...
## ..$ pm2_5 : num [1:2787] 95 82 60 48 43 35 14 21 18 22 ...
## ..$ date : Date[1:2787], format: "2014-01-01" "2014-01-02" ...
## $ Tamil Nadu :'data.frame': 454 obs. of 8 variables:
## ..$ state : chr [1:454] "Tamil Nadu" "Tamil Nadu" "Tamil Nadu" "Tamil Nadu" ...
## ..$ location: chr [1:454] "Thoothukudi" "Chennai" "Chennai" "Madurai" ...
## ..$ type : chr [1:454] "Residential" "Industrial" "Residential" "Industrial" ...
## ..$ so2 : num [1:454] 14 14 18 16 15 13 14 16 14 14 ...
## ..$ no2 : num [1:454] 14 15 21 26 34 16 15 22 13 20 ...
## ..$ rspm : num [1:454] 37 60 51 62 56 48 60 46 42 54 ...
## ..$ pm2_5 : num [1:454] 60 47 25 33.6 27 28 50 21 58 17 ...
## ..$ date : Date[1:454], format: "2015-03-02" "2015-03-03" ...
## $ Telangana :'data.frame': 354 obs. of 8 variables:
## ..$ state : chr [1:354] "Telangana" "Telangana" "Telangana" "Telangana" ...
## ..$ location: chr [1:354] "Sangareddy" "Sangareddy" "Sangareddy" "Sangareddy" ...
## ..$ type : chr [1:354] "Industrial" "Industrial" "Industrial" "Industrial" ...
## ..$ so2 : num [1:354] 1.8 12.4 2.8 8.7 2 0.3 3.4 3.03 3.5 6.5 ...
## ..$ no2 : num [1:354] 66.3 86.3 38.4 55.4 46.1 62.2 44.8 31.5 26.1 45.3 ...
## ..$ rspm : num [1:354] 99 149 51 128 33 105 100 33 21 106 ...
## ..$ pm2_5 : num [1:354] 54 78 24 56 20 35 24 10 6 49 ...
## ..$ date : Date[1:354], format: "2014-01-01" "2014-01-02" ...
## $ West Bengal :'data.frame': 640 obs. of 8 variables:
## ..$ state : chr [1:640] "West Bengal" "West Bengal" "West Bengal" "West Bengal" ...
## ..$ location: chr [1:640] "Howrah" "Durgapur" "Kolkata" "Kolkata" ...
## ..$ type : chr [1:640] "Residential" "Residential" "Others" "Industrial" ...
## ..$ so2 : num [1:640] 10 7 2 3 7 9 7 12 9 10 ...
## ..$ no2 : num [1:640] 40 57 46 54 58 69 69 40 71 74 ...
## ..$ rspm : num [1:640] 138 82 77 104 86 93 214 132 217 92 ...
## ..$ pm2_5 : num [1:640] 83 52 36 47 60 63 94 76 109 48 ...
## ..$ date : Date[1:640], format: "2015-01-02" "2015-01-02" ...
library(ggplot2)
for (i in 1:length(splitByState)){
loopdata<-splitByState[[i]]
#print(loopdata)
#print(as.character(names(splitByState[i])))
#draw graph
print(ggplot() +geom_line(data = loopdata,aes(x = date,y = pm2_5,colour = "pm2_5"),size=1) +
geom_line(data = loopdata,aes(x = date,y = so2,colour = "so2"),size=1) +
geom_line(data = loopdata,aes(x = date,y = no2,colour = "no2"),size=1) +
geom_line(data = loopdata,aes(x = date,y = rspm,colour = "rspm"),size=1) +
ggtitle(paste("State:",as.character(names(splitByState[i])))) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_colour_manual("",values = c("pm2_5" = "red","so2"="green","no2"="blue","rspm"="yellow"))+
xlab("Date")+ylab("Pollution"))
}
df_c=df
data_c_path = paste0(datadir, "/data_c.csv")
write.csv(df_c, data_c_path, row.names=FALSE)
df_c$pm2_5=NULL
df_c$date=NULL
head(df_c)
## state location type so2 no2 rspm spm
## 1 Andhra Pradesh Hyderabad Residential 4.8 17.4 NA NA
## 2 Andhra Pradesh Hyderabad Industrial 3.1 7.0 NA NA
## 3 Andhra Pradesh Hyderabad Residential 6.2 28.5 NA NA
## 4 Andhra Pradesh Hyderabad Residential 6.3 14.7 NA NA
## 5 Andhra Pradesh Hyderabad Industrial 4.7 7.5 NA NA
## 6 Andhra Pradesh Hyderabad Residential 6.4 25.7 NA NA
We dropped pm2.5 because it is the target variable and it is only avaliable for last 2 year. We dropped date because clustering model no need time variable.
for(i in 1:7){
x=sum(is.na(df_c[i]))
print(paste(names(df_c[i]),x))}
## [1] "state 0"
## [1] "location 3"
## [1] "type 0"
## [1] "so2 34648"
## [1] "no2 16239"
## [1] "rspm 40223"
## [1] "spm 237394"
location, so2,no2,rspm and spm have missing values
df_c$so2[is.na(df$so2)]= mean(df_c$so2, na.rm = TRUE)
df_c$no2[is.na(df$no2)]= mean(df_c$no2, na.rm = TRUE)
df_c$rspm[is.na(df$rspm)]= mean(df_c$rspm, na.rm = TRUE)
df_c$spm[is.na(df$spm)]= mean(df_c$spm, na.rm = TRUE)
df_c <- df_c %>% na.omit(df_c)
for(i in 1:7){
x=sum(is.na(df_c[i]))
print(paste(names(df_c[i]),x))}
## [1] "state 0"
## [1] "location 0"
## [1] "type 0"
## [1] "so2 0"
## [1] "no2 0"
## [1] "rspm 0"
## [1] "spm 0"
We mutate missing value in numerical variable using mean. 3 missing value for location, we just drop. There is no more missing values in the dataset, it is ready to export.
head(df_c)
## state location type so2 no2 rspm spm
## 1 Andhra Pradesh Hyderabad Residential 4.8 17.4 108.8171 220.7047
## 2 Andhra Pradesh Hyderabad Industrial 3.1 7.0 108.8171 220.7047
## 3 Andhra Pradesh Hyderabad Residential 6.2 28.5 108.8171 220.7047
## 4 Andhra Pradesh Hyderabad Residential 6.3 14.7 108.8171 220.7047
## 5 Andhra Pradesh Hyderabad Industrial 4.7 7.5 108.8171 220.7047
## 6 Andhra Pradesh Hyderabad Residential 6.4 25.7 108.8171 220.7047
str(df_c)
## 'data.frame': 435739 obs. of 7 variables:
## $ state : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ location: chr "Hyderabad" "Hyderabad" "Hyderabad" "Hyderabad" ...
## $ type : chr "Residential" "Industrial" "Residential" "Residential" ...
## $ so2 : num 4.8 3.1 6.2 6.3 4.7 6.4 5.4 4.7 4.2 4 ...
## $ no2 : num 17.4 7 28.5 14.7 7.5 25.7 17.1 8.7 23 8.9 ...
## $ rspm : num 109 109 109 109 109 ...
## $ spm : num 221 221 221 221 221 ...
## - attr(*, "na.action")= 'omit' Named int [1:3] 435740 435741 435742
## ..- attr(*, "names")= chr [1:3] "435740" "435741" "435742"
#data_c_path = paste0(datadir, "/data_c.csv")
#write.csv(df_c, data_c_path, row.names=FALSE)
YLT: I commenting out this plot coz it takes too long to knit. Only include it back in final version.
#pairs(~ so2 + no2 + rspm + spm, data=df_c, col = 'blue')
#plot(df_c)
RSPM and SPM seem to have slightly stronger positive relationship while the others variables seem like having poor positive relationship.
plotdata <- df_c %>%
group_by(state) %>%
summarize(mean_1 = mean(so2))
plotdata
## # A tibble: 34 × 2
## state mean_1
## <chr> <dbl>
## 1 Andhra Pradesh 7.38
## 2 Arunachal Pradesh 5.13
## 3 Assam 6.75
## 4 Bihar 18.8
## 5 Chandigarh 6.60
## 6 Chhattisgarh 12.7
## 7 Dadra & Nagar Haveli 8.95
## 8 Daman & Diu 8.20
## 9 Delhi 8.92
## 10 Goa 7.51
## # … with 24 more rows
ggplot(plotdata, aes(x = reorder(state, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "So2 by state", y = "Mean of SO2", x ="State")
Jharkhand have the highest SO2 concentration in the air followed by Uttaranchal and Uttarakhand.
plotdata <- df_c %>%
group_by(type) %>%
summarize(mean_1 = mean(so2))
plotdata
## # A tibble: 3 × 2
## type mean_1
## <chr> <dbl>
## 1 Industrial 13.2
## 2 Others 9.21
## 3 Residential 9.63
ggplot(plotdata, aes(x = reorder(type, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "So2 by area type", y = "Mean of SO2", x ="Area Type")
Industrial area have highest So2 concentration.
plotdata <- df_c %>%
group_by(state) %>%
summarize(mean_1 = mean(no2))
plotdata
## # A tibble: 34 × 2
## state mean_1
## <chr> <dbl>
## 1 Andhra Pradesh 21.8
## 2 Arunachal Pradesh 10.9
## 3 Assam 14.8
## 4 Bihar 36.2
## 5 Chandigarh 19.4
## 6 Chhattisgarh 24.9
## 7 Dadra & Nagar Haveli 18.4
## 8 Daman & Diu 16.2
## 9 Delhi 51.7
## 10 Goa 13.5
## # … with 24 more rows
ggplot(plotdata, aes(x = reorder(state, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "No2 by state", y = "Mean of NO2", x ="State")
West Bengal have the highest NO2 concentration in the air followed by Delhi and Jharkhand.
plotdata <- df_c %>%
group_by(type) %>%
summarize(mean_1 = mean(no2))
plotdata
## # A tibble: 3 × 2
## type mean_1
## <chr> <dbl>
## 1 Industrial 29.3
## 2 Others 22.7
## 3 Residential 24.1
ggplot(plotdata, aes(x = reorder(type, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "No2 by area type", y = "Mean of NO2", x ="Area Type")
Industrial area have highest No2 concentration.
plotdata <- df_c %>%
group_by(state) %>%
summarize(mean_1 = mean(rspm))
plotdata
## # A tibble: 34 × 2
## state mean_1
## <chr> <dbl>
## 1 Andhra Pradesh 79.5
## 2 Arunachal Pradesh 77.0
## 3 Assam 93.9
## 4 Bihar 118.
## 5 Chandigarh 97.1
## 6 Chhattisgarh 124.
## 7 Dadra & Nagar Haveli 86.5
## 8 Daman & Diu 89.1
## 9 Delhi 177.
## 10 Goa 63.8
## # … with 24 more rows
ggplot(plotdata, aes(x = reorder(state, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "RSPM by state", y = "Mean of RSPM", x ="State")
Punjab have the highest RSPM index followed by Delhi and Uttar Pradesh.
plotdata <- df_c %>%
group_by(type) %>%
summarize(mean_1 = mean(rspm))
plotdata
## # A tibble: 3 × 2
## type mean_1
## <chr> <dbl>
## 1 Industrial 121.
## 2 Others 101.
## 3 Residential 103.
ggplot(plotdata, aes(x = reorder(type, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "RSPM by area type", y = "Mean of RSPM", x ="Area Type")
Industrial area have highest RSPM index.
plotdata <- df_c %>%
group_by(state) %>%
summarize(mean_1 = mean(spm))
plotdata
## # A tibble: 34 × 2
## state mean_1
## <chr> <dbl>
## 1 Andhra Pradesh 212.
## 2 Arunachal Pradesh 221.
## 3 Assam 200.
## 4 Bihar 265.
## 5 Chandigarh 212.
## 6 Chhattisgarh 226.
## 7 Dadra & Nagar Haveli 187.
## 8 Daman & Diu 166.
## 9 Delhi 335.
## 10 Goa 140.
## # … with 24 more rows
ggplot(plotdata, aes(x = reorder(state, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "SPM by state", y = "Mean of SPM", x ="State")
Delhi have the highest SPM index followed by Uttar Pradesh and Uttarakhand.
plotdata <- df_c %>%
group_by(type) %>%
summarize(mean_1 = mean(spm))
plotdata
## # A tibble: 3 × 2
## type mean_1
## <chr> <dbl>
## 1 Industrial 231.
## 2 Others 217.
## 3 Residential 215.
ggplot(plotdata, aes(x = reorder(type, mean_1), y = mean_1)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = "SPM by area type", y = "Mean of SPM", x ="Area Type")
Industrial area have highest SPM index.
Besides time series modelling, we also do a clustering to identify cities (location) with similar air quality. We cluster them by using only the latest 1 year (2015-01-01 to 2015-12-31) of data, with the mean value of numerical variables (SO2, NO2, RSPM, SPM, PM2.5) as the features.
First we import relevant library. We then filter the recent data and obtain the features mentioned above:
library(cluster)
library(lubridate)
start_date<- ymd("2015-01-01")
rec_clus_data<- df[df$date > start_date, ]
### remove rows with missing location
rec_clus_data<- rec_clus_data[!is.na(rec_clus_data$location), ]
loc_data<- rec_clus_data %>% group_by(location) %>%
summarise(mean_so2 = mean(so2, na.rm=TRUE),
mean_no2 = mean(no2, na.rm=TRUE),
mean_rspm = mean(rspm, na.rm=TRUE),
mean_spm = mean(spm, na.rm=TRUE),
mean_pm2_5 = mean(pm2_5, na.rm=TRUE)
) %>% data.frame()
head(loc_data)
## location mean_so2 mean_no2 mean_rspm mean_spm mean_pm2_5
## 1 Agra 3.689055 20.822761 180.28234 NaN NaN
## 2 Ahmedabad 13.230435 20.847826 89.25217 NaN 28.63913
## 3 Aizawl 2.001890 8.385633 44.10208 NaN NaN
## 4 Akola 7.247191 8.378277 127.19476 NaN NaN
## 5 Alappuzha 2.000000 5.000000 45.33613 NaN NaN
## 6 Allahabad 3.608511 28.486692 250.65399 NaN NaN
Next we check and handle the missing values.
num_loc = nrow(loc_data)
for (col in colnames(loc_data) ){
num_na<- sum(is.na(loc_data[col]))
perc_na<- round(100*num_na/num_loc, digits=2)
print(paste(col, num_na, perc_na))
}
## [1] "location 0 0"
## [1] "mean_so2 4 1.57"
## [1] "mean_no2 3 1.18"
## [1] "mean_rspm 0 0"
## [1] "mean_spm 254 100"
## [1] "mean_pm2_5 191 75.2"
We see that SPM (100%) and PM2.5 (75.2%) have very large amount of missing values. We can only drop these features as imputing them will introduce large bias in the clustering model. For SO2 and NO2, we will impute using mean value.
loc_data$mean_so2[is.na(loc_data$mean_so2)]= mean(loc_data$mean_so2, na.rm = TRUE)
loc_data$mean_no2[is.na(loc_data$mean_no2)]= mean(loc_data$mean_no2, na.rm = TRUE)
#### set location as row names
rownames(loc_data)<- loc_data$location
#### drop unnecessary columns
clean_loc_data<- loc_data %>% select(-c(mean_spm, mean_pm2_5, location))
head(clean_loc_data)
## mean_so2 mean_no2 mean_rspm
## Agra 3.689055 20.822761 180.28234
## Ahmedabad 13.230435 20.847826 89.25217
## Aizawl 2.001890 8.385633 44.10208
## Akola 7.247191 8.378277 127.19476
## Alappuzha 2.000000 5.000000 45.33613
## Allahabad 3.608511 28.486692 250.65399
Before clustering the data, we need to scale the features so that their magnitude are of the same order.
scaled_loc_data<- clean_loc_data
for (mean_col in colnames(scaled_loc_data)){
scaled_loc_data[mean_col]<- scale(scaled_loc_data[mean_col], center=TRUE, scale=TRUE)
}
Now we can do clustering. We use the elbow method to obtain a suitable number of clusters, k.
max_k<- 20
wcss<- rep(NA, max_k-1)
for (k in c(2:max_k)){
k_clus<- kmeans(scaled_loc_data, centers = k, nstart = 10)
wcss[k-1]<- k_clus$tot.withinss
}
wcss_df<- data.frame(k=c(2:max_k), wcss = wcss)
wcss_fig<- wcss_df %>% ggplot(aes(x=k, y=wcss)) +
geom_line() + geom_point()
print(wcss_fig)
Here, we see that at around k=8 the Within Cluster Sum of Squared (WCSS) value decreases slowly. Thus we will pick optimal k as 8 to do the clustering.
best_k<- 8
clus_loc<- kmeans(scaled_loc_data, centers = scaled_loc_data[1:best_k,], nstart = 10)
clus_grp<- data.frame(clus_loc$cluster) %>% setNames("cluster")
merged_res<- merge(clean_loc_data, clus_grp, by ='row.names', all=TRUE)
merged_res$cluster<- as.factor(merged_res$cluster)
### center of each cluster
center_data<- merged_res %>% group_by(cluster) %>%
summarise(num_city = n(),
cen_mean_so2 = mean(mean_so2, na.rm=TRUE),
cen_mean_no2 = mean(mean_no2, na.rm=TRUE),
cen_mean_rspm = mean(mean_rspm, na.rm=TRUE),
) %>% data.frame()
print(center_data)
## cluster num_city cen_mean_so2 cen_mean_no2 cen_mean_rspm
## 1 1 24 8.375138 24.87456 147.06674
## 2 2 36 13.643467 21.64344 85.97458
## 3 3 32 6.353506 25.07242 70.44578
## 4 4 57 4.864980 14.48268 98.23663
## 5 5 60 3.959786 10.03803 50.47906
## 6 6 12 12.169472 36.11782 209.11197
## 7 7 14 25.603259 31.94829 138.60601
## 8 8 19 15.276456 51.08625 109.78509
The above table shows the centroid for each cluster. To better visualise the difference among the clusters, we can plot the box plot of each features for the clusters:
clus_vis_so2<- merged_res %>% ggplot(aes(x=cluster, y=mean_so2)) +
geom_boxplot()
print(clus_vis_so2)
clus_vis_no2<- merged_res %>% ggplot(aes(x=cluster, y=mean_no2)) +
geom_boxplot()
print(clus_vis_no2)
clus_vis_rspm<- merged_res %>% ggplot(aes(x=cluster, y=mean_rspm)) +
geom_boxplot()
print(clus_vis_rspm)
From the plot, we see that cities in cluster 6, 7 and 8 have high values in all 3 features, which represents low air quality. Interestingly, these clusters have significantly high value in exactly 1 of the feature: cluster 6 is high in RSPM, cluster 7 in SO2 and cluster 8 in NO2. For the others, cities in cluster 5 have the best air quality with lowest value in all 3 features. Cluster 2, 3, 4 have moderate value in all 3 features while cluster 1 has moderate value in SO2 and NO2 but with high RSPM.
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the plot. ##
Conclusion (Puvee)